In [2]:
import numpy as np
import MyML.EAC.full as myEacFull
import MyML.EAC.sparse as myEacSp
import MyML.EAC.eac as myEac
import MyML.EAC.eac_new as myNewEac
import MyML.cluster.K_Means3 as myKM
import MyML.helper.partition as myEacPart
import MyML.metrics.accuracy as myAcc

import matplotlib.pyplot as plt
%matplotlib inline


/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/decorators.py:106: UserWarning: autojit is deprecated and will be removed in a future release. Use jit instead.
  warn('autojit is deprecated and will be removed in a future release. Use jit instead.')

In [2]:
reload(myEacFull)
reload(myEacPart)
reload(myEacSp)
reload(myEac)
reload(myNewEac)


Out[2]:
<module 'MyML.EAC.eac_new' from '/home/chiroptera/workspace/QCThesis/MyML/EAC/eac_new.pyc'>

In [3]:
# rules for picking kmin kmax 
def rule1(n):
    """sqrt"""
    k = [np.sqrt(n)/2, np.sqrt(n)]
    k = map(np.ceil,k)
    k = map(int, k)
    return k

def rule2(n):
    """2sqrt"""
    k = map(lambda x: x * 2, rule1(n))
    return k

def rule3(n, sk, th):
    """fixed s/k"""
    k = [n * 1.0 / sk, th * n * 1.0 / sk]
    k = map(np.ceil,k)
    k = map(int, k)
    return k

def rule4(n):
    """sk=sqrt/2,th=30%"""
    return rule3(n, sk1(n), 1.3)

def rule5(n):
    """sk=300,th=30%"""
    return rule3(n,300, 1.3)

# rules for picking number of samples per cluster
def sk1(n):
    """sqrt/2"""
    return int(np.sqrt(n) / 2)
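
For reference, these rules give the following kmin/kmax values for the n = 100000 dataset used below (the values follow directly from the formulas above):

print rule1(100000)  # [159, 317]  (ceil(sqrt(n)/2), ceil(sqrt(n)))
print rule2(100000)  # [318, 634]
print sk1(100000)    # 158
print rule4(100000)  # [633, 823]  (ceil(n/sk), ceil(1.3 * n/sk) with sk = 158)
print rule5(100000)  # [334, 434]  (same, with sk = 300)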

In [4]:
from sklearn.datasets import make_blobs

n_samples = 100000
data,gt = make_blobs(n_samples, centers=4)
%time data = data.astype(np.float32)
n_clusts = rule4(n_samples)

plt.plot(data[:,0], data[:,1], '.')

generator = myKM.K_Means()
generator._MAX_THREADS_BLOCK = 256
%time ensemble = myEacPart.generateEnsemble(data, generator, n_clusts,npartitions=100)


CPU times: user 857 µs, sys: 0 ns, total: 857 µs
Wall time: 868 µs
CPU times: user 3min 7s, sys: 20.1 s, total: 3min 27s
Wall time: 3min 28s
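
generateEnsemble itself is not shown here. As a rough sketch of what it produces, assuming (from how the ensemble is consumed below) that each partition is a list of clusters and each cluster is an array of sample indices, an equivalent with plain sklearn could look like this; myKM.K_Means is the CUDA-backed generator actually used above:

from sklearn.cluster import KMeans  # stand-in for myKM.K_Means

def generate_ensemble_sketch(data, n_clusts, npartitions=100):
    kmin, kmax = n_clusts
    ensemble = []
    for _ in xrange(npartitions):
        # each partition uses a random number of clusters in [kmin, kmax]
        k = np.random.randint(kmin, kmax + 1)
        labels = KMeans(n_clusters=k, n_init=1, max_iter=10).fit_predict(data)
        ensemble.append([np.where(labels == c)[0] for c in xrange(k)])
    return ensemble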

In [7]:
myEacSp._compute_max_assocs_from_ensemble(ensemble)*3


Out[7]:
1422
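
_compute_max_assocs_from_ensemble presumably returns the size of the biggest cluster in the ensemble (an assumption; MyML's heuristic may differ), and the factor of 3 is a safety margin for pre-allocating each row of the sparse matrix:

# hypothetical equivalent of the heuristic: largest cluster in the ensemble
max_assocs = max(len(cluster) for partition in ensemble for cluster in partition)
max_assocs * 3  # pre-allocate 3x that per sample, as used below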

In [8]:
%time fullEAC = myNewEac.EAC(n_samples, sparse=True, condensed=False, sparse_keep_degree=True)
fullEAC.sp_max_assocs_mode = "constant"
%time fullEAC.buildMatrix(ensemble)

%time spEAC = myNewEac.EAC(n_samples, sparse=True, condensed=True, sparse_keep_degree=True)
spEAC.sp_max_assocs_mode = "constant"
%time spEAC.buildMatrix(ensemble)


CPU times: user 30 µs, sys: 3 µs, total: 33 µs
Wall time: 45.1 µs
CPU times: user 3min 34s, sys: 1 s, total: 3min 35s
Wall time: 3min 36s
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 26 µs
CPU times: user 1min 48s, sys: 718 ms, total: 1min 49s
Wall time: 2min 5s
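
For reference, the co-association matrix that buildMatrix accumulates can be written densely in a few lines of numpy. This sketch is only usable for small n; at n = 100000 the dense matrix would need ~10 GB even with uint8 counts, which is exactly why the CSR variants above exist (condensed=True additionally keeps only the upper triangle, hence the sp + sp.T checks further down):

def coassoc_dense(ensemble, n):
    C = np.zeros((n, n), dtype=np.uint8)  # counts fit in uint8 for 100 partitions
    for partition in ensemble:
        for cluster in partition:
            idx = np.asarray(cluster)
            # samples that co-occur in a cluster get their pairwise counts bumped
            C[np.ix_(idx, idx)] += 1
    np.fill_diagonal(C, 0)  # self-associations are not used
    return C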

In [11]:
import seaborn as sns
sns.set_style("darkgrid")


/home/chiroptera/anaconda/lib/python2.7/site-packages/numexpr/cpuinfo.py:42: UserWarning: [Errno 12] Cannot allocate memory
  warnings.warn(str(e), UserWarning, stacklevel=stacklevel)

In [12]:
sns.set_palette(sns.color_palette("deep", 6))

In [43]:
fig1 = plt.figure(figsize=(16,6))

ax = fig1.add_subplot(1,2,1)
x = np.arange(n_samples)
y = spEAC.coassoc.degree[:-1]

ax.plot(y,x, 'k', alpha=1)
ax.fill_between(y,0,x,color='k', alpha=1)
ax.plot([spEAC.sp_max_assocs, spEAC.sp_max_assocs], [0, n_samples], 'r', label="max. number of assocs.")
ax.set_title("Number of associations per sample on a condensed matrix")
ax.set_ylabel("Sample no.")
ax.set_xlabel("Number of associations")
plt.gca().invert_yaxis()
ax.legend(loc="best")

ax = fig1.add_subplot(1,2,2)
x = np.arange(n_samples)
y = fullEAC.coassoc.degree[:-1]

ax.plot(y,x, 'k', alpha=1)
ax.fill_between(y,0,x,color='k', alpha=1)
ax.plot([fullEAC.sp_max_assocs, fullEAC.sp_max_assocs], [0, n_samples], 'r', label="max. number of assocs.")
ax.set_title("Number of associations per sample on a complete matrix")
ax.set_ylabel("Sample no.")
ax.set_xlabel("Number of associations")
plt.gca().invert_yaxis()
ax.legend(loc="best")


Out[43]:
<matplotlib.legend.Legend at 0x7f408a23d8d0>

In [45]:
fig_cond = plt.figure()
x = np.arange(n_samples)
y = spEAC.coassoc.degree[:-1]

plt.barh(x, y)
plt.plot([spEAC.sp_max_assocs, spEAC.sp_max_assocs], [0, n_samples], 'r', label="max. number of assocs.")
plt.title("Number of associations per sample on a condensed matrix")
plt.ylabel("Sample no.")
plt.xlabel("Number of associations")
plt.gca().invert_yaxis()
plt.legend(loc="best")


Out[45]:
<matplotlib.legend.Legend at 0x7f0a459edb10>

In [13]:
from MyML.helper.plotting import save_fig

In [47]:
save_fig(fig_cond, "/home/chiroptera/eac_csr_cond_degree")
save_fig(fig_full, "/home/chiroptera/eac_csr_full_degree")

In [46]:
fig_full = plt.figure()
x = np.arange(n_samples)
y = fullEAC.coassoc.degree[:-1]

plt.barh(x, y)
plt.plot([fullEAC.sp_max_assocs, fullEAC.sp_max_assocs], [0, n_samples], 'r', label="max. number of assocs.")
plt.title("Number of associations per sample on a complete matrix")
plt.ylabel("Sample no.")
plt.xlabel("Number of associations")
plt.gca().invert_yaxis()
plt.legend(loc="best")


Out[46]:
<matplotlib.legend.Legend at 0x7f0a3ec75990>

In [7]:
%time fullEAC.finalClustering(n_clusters=0) 
%time spEAC.finalClustering(n_clusters=0)
# b MyML/EAC/eac_new.py:606
# b MyML/EAC/eac_new.py:556


CPU times: user 419 ms, sys: 0 ns, total: 419 ms
Wall time: 419 ms
CPU times: user 300 ms, sys: 12 ms, total: 312 ms
Wall time: 311 ms
Out[7]:
array([0, 1, 2, ..., 3, 3, 2], dtype=int32)

In [100]:
histFig = plt.figure(figsize=(16,6))

ax = histFig.add_subplot(1,2,1)
ax.hist(fullEAC.labels,bins=fullEAC.n_fclusts)
ax.set_title("Full coassoc")
print "full bincount", np.bincount(fullEAC.labels)

ax = histFig.add_subplot(1,2,2)
ax.hist(spEAC.labels,bins=spEAC.n_fclusts)
ax.set_title("Sparse coassoc")
print "full bincount", np.bincount(spEAC.labels)


full bincount [1250 1250 2500]
sparse bincount [1250 2500 1250]

In [101]:
acc = myAcc.HungarianIndex(n_samples)
acc.score(gt, fullEAC.labels)
acc.accuracy


Out[101]:
0.75

In [102]:
acc = myAcc.HungarianIndex(n_samples)
acc.score(gt, spEAC.labels)
acc.accuracy


Out[102]:
0.0
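
HungarianIndex matches found clusters to ground-truth classes one-to-one before counting hits. A minimal sketch with scipy (assuming scipy >= 0.17 for linear_sum_assignment; MyML's implementation may differ):

from scipy.optimize import linear_sum_assignment

def hungarian_accuracy(gt, labels):
    # contingency table: rows = true classes, columns = found clusters
    C = np.zeros((gt.max() + 1, labels.max() + 1), dtype=np.int64)
    np.add.at(C, (gt, labels), 1)
    # one-to-one assignment that maximizes the number of matched samples
    rows, cols = linear_sum_assignment(-C)
    return C[rows, cols].sum() / float(gt.size)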

In [103]:
acc2 = myAcc.ConsistencyIndex(N=n_samples)
acc2.score(gt, fullEAC.labels)


Out[103]:
0.75

In [104]:
acc2 = myAcc.ConsistencyIndex(N=n_samples)
acc2.score(gt, spEAC.labels)


Out[104]:
0.75

In [36]:
for c in xrange(fullEAC.n_fclusts):
    c_idx = fullEAC.labels == c
    plt.plot(data[c_idx,0], data[c_idx,1], '.')



In [37]:
for c in xrange(spEAC.n_fclusts):
    c_idx = spEAC.labels == c
    plt.plot(data[c_idx,0], data[c_idx,1], '.')



In [20]:
(fullEAC.coassoc.todense() == (spEAC.coassoc.todense() + spEAC.coassoc.todense().T)).all()


Out[20]:
True

In [14]:
%time full = myEacFull.EAC_FULL(n_samples, condensed=False)
%time full.update_ensemble(ensemble)


CPU times: user 1.57 ms, sys: 70 µs, total: 1.64 ms
Wall time: 872 µs
CPU times: user 274 ms, sys: 3.84 ms, total: 278 ms
Wall time: 279 ms

In [15]:
%time cfull = myEacFull.EAC_FULL(n_samples, condensed=True)
%time cfull.update_ensemble(ensemble)


CPU times: user 683 µs, sys: 30 µs, total: 713 µs
Wall time: 472 µs
CPU times: user 69.2 ms, sys: 23 µs, total: 69.2 ms
Wall time: 67.1 ms

In [16]:
(full.coassoc == (cfull.todense() + cfull.todense().T)).all()


Out[16]:
True

In [17]:
max_assocs = myEacSp._compute_max_assocs_from_ensemble(ensemble) * 3
%time sp = myEacSp.EAC_CSR(n_samples, max_assocs, condensed=False, sort_mode="numpy")
%time sp.update_ensemble(ensemble)


CPU times: user 141 µs, sys: 6 µs, total: 147 µs
Wall time: 131 µs
CPU times: user 3.28 s, sys: 11.3 ms, total: 3.29 s
Wall time: 3.28 s

In [18]:
max_assocs = myEacSp._compute_max_assocs_from_ensemble(ensemble) * 3
%time sp2 = myEacSp.EAC_CSR(n_samples, max_assocs, condensed=False, sort_mode="surgical")
%time sp2.update_ensemble(ensemble)


CPU times: user 146 µs, sys: 6 µs, total: 152 µs
Wall time: 136 µs
CPU times: user 1.24 s, sys: 16.2 ms, total: 1.26 s
Wall time: 1.24 s

In [21]:
sp._condense(keep_degree=True)
sp2._condense(keep_degree=True)
print (sp.todense() == sp2.todense()).all()
print (sp.todense() == full.coassoc).all()


True
True

In [22]:
max_assocs = myEacSp._compute_max_assocs_from_ensemble(ensemble) * 3
%time csp = myEacSp.EAC_CSR(n_samples, max_assocs, condensed=True, sort_mode="numpy")
%time csp.update_ensemble(ensemble)


CPU times: user 269 µs, sys: 10 µs, total: 279 µs
Wall time: 223 µs
CPU times: user 2.31 s, sys: 3.76 ms, total: 2.31 s
Wall time: 2.3 s

In [23]:
max_assocs = myEacSp._compute_max_assocs_from_ensemble(ensemble) * 3
%time csp2 = myEacSp.EAC_CSR(n_samples, max_assocs, condensed=True, sort_mode="surgical")
%time csp2.update_ensemble(ensemble)


CPU times: user 140 µs, sys: 4 µs, total: 144 µs
Wall time: 131 µs
CPU times: user 513 ms, sys: 8.18 ms, total: 521 ms
Wall time: 510 ms

In [24]:
csp._condense(keep_degree=True)
csp2._condense(keep_degree=True)

print "condensed numpy", ((csp.todense() + csp.todense().T) == full.coassoc).all()
print "condensed surgical", ((csp2.todense() + csp2.todense().T) == full.coassoc).all()


condensed numpy True
condensed surgical True

In [25]:
max_assocs = myEacSp._compute_max_assocs_from_ensemble(ensemble) * 3
%time csp2 = myEacSp.EAC_CSR(n_samples, max_assocs, max_assocs_type='linear', condensed=True, sort_mode="surgical")
%time csp2.update_ensemble(ensemble)


CPU times: user 0 ns, sys: 3.91 ms, total: 3.91 ms
Wall time: 270 µs
CPU times: user 455 ms, sys: 4.12 ms, total: 459 ms
Wall time: 455 ms

In [26]:
csp2._condense(keep_degree=True)

In [27]:
print "condensed surgical", ((csp2.todense() + csp2.todense().T) == full.coassoc).all()


condensed surgical True

Linkage sparse


In [4]:
import scipy.sparse.csgraph as csgraph

In [65]:
reload(csgraph)


Out[65]:
<module 'scipy.sparse.csgraph' from '/home/chiroptera/anaconda/lib/python2.7/site-packages/scipy/sparse/csgraph/__init__.pyc'>

In [202]:
mst.indptr


Out[202]:
array([   0,    2,    4, ..., 1997, 1997, 1997], dtype=int32)

In [54]:
from scipy.sparse.csgraph._validation import validate_graph

In [63]:
c2 = validate_graph(coassoc, True, dtype=coassoc.dtype, dense_output=False)

In [64]:
c2


Out[64]:
<5000x5000 sparse matrix of type '<type 'numpy.float64'>'
	with 2091859 stored elements in Compressed Sparse Row format>

In [13]:
import scipy_numba.sparse.csgraph._min_spanning_tree as myMST

In [53]:
reload(myMST)


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-53-1145d448f1ad> in <module>()
----> 1 reload(myMST)

/home/chiroptera/workspace/scipy_numba/scipy_numba/sparse/csgraph/_min_spanning_tree.py in <module>()
     10 from scipy.sparse import csr_matrix, isspmatrix_csc, isspmatrix
     11 from scipy.sparse.csgraph._validation import validate_graph
---> 12 from scipy_numba.sparse.csgraph._validation import validate_graph
     13 
     14 #include 'parameters.pxi'

/home/chiroptera/workspace/scipy_numba/scipy_numba/sparse/csgraph/_validation.py in <module>()
      3 import numpy as np
      4 from scipy.sparse import csr_matrix, isspmatrix, isspmatrix_csc
----> 5 from ._tools import csgraph_to_dense, csgraph_from_dense,\
      6     csgraph_masked_from_dense, csgraph_from_masked
      7 

ImportError: No module named _tools

In [23]:
coassoc2 = spEAC.coassoc.csr.copy()
coassoc2.data = coassoc2.data.max() + 1 - coassoc2.data

%time mst2 = myMST.minimum_spanning_tree(coassoc2)


CPU times: user 340 ms, sys: 7.97 ms, total: 348 ms
Wall time: 347 ms

In [39]:
coassoc = spEAC.coassoc.csr.copy()
coassoc.data = coassoc.data.max() + 1 - coassoc.data

In [24]:
coassoc = spEAC.coassoc.csr.copy()
coassoc.data = coassoc.data.max() + 1 - coassoc.data

%time mst = csgraph._min_spanning_tree.minimum_spanning_tree(coassoc)


CPU times: user 381 ms, sys: 7.59 ms, total: 389 ms
Wall time: 389 ms
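
The max() + 1 - x transform flips association counts into positive "distances" so that the MST treats strong associations as short edges; the + 1 keeps every stored entry non-zero (a zero would be indistinguishable from a missing edge in the CSR graph). A tiny worked example:

w = np.array([100., 60., 1.])  # association counts out of 100 partitions
d = w.max() + 1 - w            # -> [1., 41., 100.]: strongest assoc = shortest edge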

In [27]:
mst = mst.astype(mst2.dtype)

In [42]:
coassoc_val = csgraph._validation.validate_graph(coassoc, True, dense_output=False)

In [52]:
coassoc_val.data


Out[52]:
array([  60.,   99.,  100., ...,  100.,   86.,   81.])

In [51]:
coassoc.data[0]=95

In [46]:
coassoc_val.indices


Out[46]:
array([  59,   13,   24, ..., 4998, 4998, 4999], dtype=int32)

In [36]:
a = csgraph.connected_components(mst)

In [35]:
fig = plt.figure(figsize=(16,6))
ax = fig.add_subplot(1,2,1)
plt.hist(mst.data, bins=mst.max())
ax = fig.add_subplot(1,2,2)
plt.hist(mst2.data, bins=mst2.max())


Out[35]:
(array([ 142.,  276.,  374.,  442.,  458.,  504.,  478.,  472.,  408.,
         358.,  290.,  265.,  189.,  148.,   85.,   38.,   26.,   20.,
          10.,    4.,    4.,    0.,    0.,    2.,    1.,    1.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    1.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    1.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    1.,    0.,    0.,    0.,    0.,    0.,    1.]),
 array([   1.  ,    1.99,    2.98,    3.97,    4.96,    5.95,    6.94,
           7.93,    8.92,    9.91,   10.9 ,   11.89,   12.88,   13.87,
          14.86,   15.85,   16.84,   17.83,   18.82,   19.81,   20.8 ,
          21.79,   22.78,   23.77,   24.76,   25.75,   26.74,   27.73,
          28.72,   29.71,   30.7 ,   31.69,   32.68,   33.67,   34.66,
          35.65,   36.64,   37.63,   38.62,   39.61,   40.6 ,   41.59,
          42.58,   43.57,   44.56,   45.55,   46.54,   47.53,   48.52,
          49.51,   50.5 ,   51.49,   52.48,   53.47,   54.46,   55.45,
          56.44,   57.43,   58.42,   59.41,   60.4 ,   61.39,   62.38,
          63.37,   64.36,   65.35,   66.34,   67.33,   68.32,   69.31,
          70.3 ,   71.29,   72.28,   73.27,   74.26,   75.25,   76.24,
          77.23,   78.22,   79.21,   80.2 ,   81.19,   82.18,   83.17,
          84.16,   85.15,   86.14,   87.13,   88.12,   89.11,   90.1 ,
          91.09,   92.08,   93.07,   94.06,   95.05,   96.04,   97.03,
          98.02,   99.01,  100.  ]),
 <a list of 100 Patch objects>)

In [131]:
# inputs
n_clusters = 4
n_partitions = 100

In [132]:
# converting to diassociations
coassoc.data = coassoc.data.max() + 1 - coassoc.data

# get minimum spanning tree
mst = csgraph.minimum_spanning_tree(coassoc).astype(np.uint8)

# compute number of disconnected components:
# a spanning forest over n nodes with e edges has n - e components
n_disconnect_clusters = mst.shape[0] - mst.nnz

# sort associations by weights
asort = mst.data.argsort()
sorted_weights = mst.data[asort]

if n_clusters == 0:
    # compute lifetimes
    lifetimes = sorted_weights[1:] - sorted_weights[:-1]

    # use n_partitions + 1 as the maximum possible weight because 1 was
    # also added when converting to diassocs, to avoid zero weights
    disconnect_lifetime = n_partitions + 1 - sorted_weights[-1]

    # maximum lifetime
    m_index = np.argmax(lifetimes)
    th = lifetimes[m_index]

    # get number of clusters from lifetimes
    indices = np.where(sorted_weights > sorted_weights[m_index])[0]
    if indices.size == 0:
        cont = 1
    else:
        cont = indices.size + 1

    #testing the situation when only 1 cluster is present
    # if maximum lifetime is smaller than 2 times the minimum
    # don't make any cuts (= 1 cluster)
    # max>2*min_interval -> nc=1
    close_to_zero_indices = np.where(np.isclose(lifetimes, 0))
    minimum = np.min(lifetimes[close_to_zero_indices])

    if th < 2 * minimum:
        cont = 1

    # add disconnected clusters to number of clusters if disconnected
    # lifetime is smaller
    if th > disconnect_lifetime:
        cont += n_disconnect_clusters
    else:
        cont = n_disconnect_clusters

    nc_stable = cont
else:
    nc_stable = n_clusters

n_comps, labels = csgraph.connected_components(mst)
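
A toy run of the lifetime criterion above (hypothetical weights, not from this dataset):

sorted_weights = np.array([1, 2, 2, 3, 9])
lifetimes = sorted_weights[1:] - sorted_weights[:-1]  # [1, 0, 1, 6]
m_index = np.argmax(lifetimes)                        # 3: the biggest jump is 3 -> 9
# exactly one edge is heavier than sorted_weights[3] == 3, so
# cont = indices.size + 1 == 2 clusters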

In [133]:
print n_comps, labels
plt.hist(labels, bins=n_comps, log=True)


4 [0 1 0 ..., 2 1 1]
Out[133]:
(array([ 667.,  666.,  666.,    1.]),
 array([ 0.  ,  0.75,  1.5 ,  2.25,  3.  ]),
 <a list of 4 Patch objects>)

In [134]:
for c in xrange(n_comps):
    c_idx = labels == c
    plt.plot(data[c_idx,0],data[c_idx,1],'.')



In [33]:
nc_stable


Out[33]:
3

In [66]:
nc_stable


Out[66]:
2

In [ ]:
# excerpt from the final-clustering routine (hence the references to
# self and labels_from_Z)
if n_clusters == 0:
    # the lifetime is computed as the distance difference between any
    # two consecutive merges, i.e. the distance between passing from
    # n to n-1 clusters

    lifetimes = Z[1:, 2] - Z[:-1, 2]

    m_index = np.argmax(lifetimes)

    # Z is ordered by increasing connection weight; every connection
    # heavier than the one at m_index sits above the jump that produced
    # the maximum lifetime, and all of them (inclusive) are removed for
    # the final clustering
    indices = np.where(Z[:, 2] > Z[m_index, 2])[0]
    #indices = np.arange(m_index+1, Z.shape[0])
    if indices.size == 0:
        cont = 1
    else:
        cont = indices.size + 1

    # store maximum lifetime
    th = lifetimes[m_index]

    # test for the single-cluster situation: if the maximum lifetime is
    # smaller than 2 times the minimum, don't make any cuts (= 1 cluster)
    # max > 2 * min_interval -> nc = 1
    close_to_zero_indices = np.where(np.isclose(lifetimes, 0))
    minimum = np.min(lifetimes[close_to_zero_indices])

    if th < 2 * minimum:
        cont = 1

    nc_stable = cont
else:
    nc_stable = n_clusters

if nc_stable > 1:
    # only the labels are of interest
    labels = labels_from_Z(Z, n_clusters=nc_stable)

    # rename labels to the range 0..nc_stable-1
    i = 0
    for l in np.unique(labels):
        labels[labels == l] = i
        i += 1
else:
    labels = np.zeros(self.n_samples, dtype=np.int32)

self.labels_ = labels
return labels

linkage full


In [159]:
from scipy.cluster.hierarchy import linkage

In [204]:
coassoc = cfull.coassoc.copy()

In [207]:
myEac.make_diassoc_1d(coassoc, n_partitions)

# apply linkage
%time Z = myEac.linkage(coassoc, method="average")


CPU times: user 3.01 s, sys: 9.12 ms, total: 3.02 s
Wall time: 3.03 s
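
labels_from_Z, used below, presumably cuts the dendrogram at a fixed number of clusters; scipy's own API for that is fcluster (an assumed equivalent, not necessarily what MyML does):

from scipy.cluster.hierarchy import fcluster

labels = fcluster(Z, t=100, criterion='maxclust') - 1  # fcluster numbers clusters from 1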

In [208]:
Z


Out[208]:
array([[  3.10000000e+01,   2.06000000e+02,   1.00000000e+00,
          2.00000000e+00],
       [  2.31000000e+02,   9.47000000e+02,   1.00000000e+00,
          2.00000000e+00],
       [  2.55000000e+02,   1.66400000e+03,   1.00000000e+00,
          2.00000000e+00],
       ..., 
       [  3.98900000e+03,   3.99300000e+03,   9.96209231e+01,
          6.67000000e+02],
       [  3.99400000e+03,   3.99600000e+03,   1.01000000e+02,
          1.33400000e+03],
       [  3.99500000e+03,   3.99700000e+03,   1.01000000e+02,
          2.00000000e+03]])
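
Each row of Z follows scipy's linkage format: (cluster i, cluster j, merge distance, size of the merged cluster). The last merges happen at dissociation 101 = n_partitions + 1, i.e. between groups of samples that were never co-clustered in any partition.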

In [185]:
labels = myEac.labels_from_Z(Z, n_clusters=100)
# rename labels
i=0
for l in np.unique(labels):
    labels[labels == l] = i
    i += 1

In [209]:
plt.hist(Z[:,2], bins=Z.shape[0])


Out[209]:
(array([ 35.,   0.,   0., ...,   0.,   0.,   2.]),
 array([   1.        ,    1.05002501,    1.10005003, ...,  100.89994997,
         100.94997499,  101.        ]),
 <a list of 1999 Patch objects>)

In [186]:
print labels
plt.hist(labels, bins=100, log=True)


[98 95 63 ..., 71 95 82]
Out[186]:
(array([   1.,    1.,    1.,    1.,    1.,    1.,    1.,    1.,    1.,
           1.,    1.,    1.,    1.,    1.,    1.,    1.,    1.,    1.,
           1.,    1.,    1.,    1.,    1.,    1.,    1.,    1.,    1.,
           1.,    1.,    1.,    1.,    1.,    1.,    1.,    1.,    1.,
           1.,    1.,    1.,    1.,    1.,    1.,    2.,    3.,    2.,
           3.,    3.,    2.,    2.,    2.,   12.,    2.,    2.,    4.,
           4.,    3.,    3.,    6.,    9.,    2.,    4.,    2.,   15.,
          20.,    7.,    3.,   41.,    3.,   15.,    3.,   17.,   25.,
          39.,    7.,   39.,   14.,   42.,   50.,    4.,    2.,    7.,
          51.,   45.,    7.,    5.,   14.,    7.,   49.,  260.,   51.,
          57.,   47.,   47.,   45.,    4.,  394.,   74.,    3.,  191.,  182.]),
 array([  0.  ,   0.99,   1.98,   2.97,   3.96,   4.95,   5.94,   6.93,
          7.92,   8.91,   9.9 ,  10.89,  11.88,  12.87,  13.86,  14.85,
         15.84,  16.83,  17.82,  18.81,  19.8 ,  20.79,  21.78,  22.77,
         23.76,  24.75,  25.74,  26.73,  27.72,  28.71,  29.7 ,  30.69,
         31.68,  32.67,  33.66,  34.65,  35.64,  36.63,  37.62,  38.61,
         39.6 ,  40.59,  41.58,  42.57,  43.56,  44.55,  45.54,  46.53,
         47.52,  48.51,  49.5 ,  50.49,  51.48,  52.47,  53.46,  54.45,
         55.44,  56.43,  57.42,  58.41,  59.4 ,  60.39,  61.38,  62.37,
         63.36,  64.35,  65.34,  66.33,  67.32,  68.31,  69.3 ,  70.29,
         71.28,  72.27,  73.26,  74.25,  75.24,  76.23,  77.22,  78.21,
         79.2 ,  80.19,  81.18,  82.17,  83.16,  84.15,  85.14,  86.13,
         87.12,  88.11,  89.1 ,  90.09,  91.08,  92.07,  93.06,  94.05,
         95.04,  96.03,  97.02,  98.01,  99.  ]),
 <a list of 100 Patch objects>)

plot stuff


In [14]:
fig1 = plt.figure(figsize=(16,6))

ax = fig1.add_subplot(1,2,1)
x = np.arange(n_samples)
y = fullEAC.coassoc.degree[:-1]

x_max = [0, n_samples]
y_max = [fullEAC.sp_max_assocs, fullEAC.sp_max_assocs]

ax.plot(x, y, 'k', alpha=1)
ax.fill_between(x, 0, y, color='k', alpha=1)
ax.plot(x_max, y_max, 'r', label="max. number of assocs.")

ax.set_title("Number of associations per sample on a condensed matrix")
ax.set_xlabel("Sample no.")
ax.set_ylabel("Number of associations")
#ax.legend(loc=4)

################################################

ax = fig1.add_subplot(1,2,2)
x = np.arange(n_samples)
y = spEAC.coassoc.degree[:-1]

lin_indptr = myEacSp.indptr_linear(n_samples, fullEAC.sp_max_assocs,
                                   spEAC.coassoc.n_s, spEAC.coassoc.n_e,
                                   spEAC.coassoc.val_s, spEAC.coassoc.val_e)
y_lin = lin_indptr[1:] - lin_indptr[:-1]

fit = np.polyfit(x,y,1) # linear regression of degree
fit_fn = np.poly1d(fit)
y_reg = fit_fn(x)
y_reg = np.where(y_reg<0,0,y_reg)

ax.plot(x, y, color='k')
ax.fill_between(x, 0, y, color='k', alpha=1)
ax.plot(x, y_reg, label="linear regression of no. of assocs.")
ax.plot(x, y_lin, label="pre-allocated no. assocs.")
ax.plot(x_max, y_max, label="max. number of assocs.")

ax.set_title("Number of associations per sample on a condensed matrix")
ax.set_xlabel("Sample no.")
ax.set_ylabel("Number of associations")
ax.legend(loc=[0.5,0.7])


Out[14]:
<matplotlib.legend.Legend at 0x7f86f21b71d0>

In [15]:
save_fig(fig1, "/home/chiroptera/eac_csr_cond_full_degree_100k")

In [54]:
x = np.arange(1,10,0.05)
y = 1 / np.sqrt(x)
y = y.max() -y
y = y / y.max()
plt.plot(x,y)
plt.plot(x,y[::-1])
#plt.plot(x,np.sqrt(x)[::-1]/np.sqrt(x).max())


Out[54]:
[<matplotlib.lines.Line2D at 0x7ff763e48710>]

In [52]:
y.sum()


Out[52]:
136.40980611903461

In [15]:
x = np.arange(0,2,0.05)
y = np.exp(x)
y /= y.max()
y = 1-y
plt.plot(x,y)


Out[15]:
[<matplotlib.lines.Line2D at 0x7ff76b6e5c90>]

In [57]:
reload(myEacSp)


Out[57]:
<module 'MyML.utils.sparse' from '/home/chiroptera/workspace/QCThesis/MyML/utils/sparse.pyc'>

In [60]:
y, y_sum = myEacSp.linear(n_samples, max_assocs, 0.1, 1, 1, 0.02)

In [61]:
total_area = n_samples * max_assocs
y_sum * 1.0 / total_area


Out[61]:
0.55873891547049437
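
That is, with these parameters the linear allocation scheme reserves only about 56% of the memory that the constant max_assocs scheme (total_area) would use.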

In [13]:
fig = plt.figure(figsize=(8,6))
x = np.arange(n_samples)
ax = fig.add_subplot(1,1,1)
ax.bar(x, csp.degree[:-1])
ax.plot([0,n_samples],[max_assocs, max_assocs])
#ax.plot(np.arange(n_samples), max_assoc_scheme)
ax.plot(x,y)

ax.set_title("full sparse matrix")
ax.set_xlabel("sample id")


Out[13]:
<matplotlib.text.Text at 0x7ff76c3cf9d0>

In [67]:
import MyML.helper.scan as myScan
reload(myScan)


Out[67]:
<module 'MyML.helper.scan' from '/home/chiroptera/workspace/QCThesis/MyML/helper/scan.py'>

In [63]:
y_c = y.copy()

In [71]:
y_sum


Out[71]:
3503293

In [72]:
y_c


Out[72]:
array([1254, 1254, 1254, ...,   25,   25, 5000], dtype=int32)

In [68]:
myScan.exprefixsumNumbaSingle(y)


Out[68]:
3508293
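
exprefixsumNumbaSingle converts the per-row counts into a CSR indptr in place (an exclusive prefix sum) and returns the total sum of the counts. Assuming those semantics, which the check in In [76] below is consistent with, the same indptr with plain numpy would be:

indptr = np.concatenate(([0], np.cumsum(y_c[:-1])))  # exclusive scan of the counts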

In [77]:
class testme:
    def __init__(self,**kwargs):
        self.val2=kwargs.get("val2", 50)

In [80]:
a=testme(val2=100)
a.val2


Out[80]:
100

In [73]:
y


Out[73]:
array([      0,    1254,    2508, ..., 3503243, 3503268, 3503293], dtype=int32)

In [76]:
((y[1:] - y[:-1]) == y_c[:-1]).all()


Out[76]:
True